ML Analysis of LGG Samples from Liquid Biopsy¶

Author: Shehbeel Arif¶

Preclinical Laboratory Research Unit¶

The Center for Data Driven Discovery in Biomedicine (D3b)¶

Children's Hospital of Philadelphia¶

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from umap.umap_ import UMAP
import plotly.express as px

# PCA
#from sklearn.decomposition import PCA
# Library for t-SNE projections
#from sklearn.manifold import TSNE
2022-09-26 22:27:27.024697: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
In [2]:
# Load metadata
meta = pd.read_csv('lgg_lb_meta.csv')
meta = meta.set_index(['SDG_ID'])
#meta

# Load gene expression matrix
df = pd.read_csv('lb_plasma_matrix.csv')
tdf = df.T
tdf.columns = tdf.iloc[0] 
tdf = tdf[1:]
#tdf

# Merge metadata and gene expression matrix
main_df = pd.concat([meta, tdf], axis=1, join="inner")
#main_df

# Store Sample IDs as a list
sample_ids = main_df.index.to_list()

Using Short Histology¶

In [3]:
short_histology_df = main_df.drop(['Specimen_Type', 'Diagnosis', 'Tumor_Subtype', 'Relapse', 'Survival_Status'],axis=1)
#short_histology_df

Exploring the data¶

In [4]:
# Store histologies
hist = short_histology_df['Short_Histology'].to_list()

# Plot UMAP
umap_2d = UMAP(n_components=2, init='random', random_state=0)
#umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
#proj_3d = umap_3d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])

umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
#umap_2d_df

ufig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2', title='UMAP Plot of Short Histology', color=hist, hover_data=['Sample_ID'])

#ufig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=hist)

# ufig_2d.update_xaxes(title_text='UMAP1')
# ufig_2d.update_yaxes(title_text='UMAP2')

ufig_2d.show()
#ufig_3d.show()

# Plot t-SNE
#tsne_2d = TSNE(n_components=2, init='random', random_state=0, learning_rate='auto')
#tsne_3d = TSNE(n_components=3, init='random', random_state=0, learning_rate='auto')

#proj_2d = tsne_2d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
#proj_3d = tsne_3d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])

#tfig_2d = px.scatter(proj_2d, x=0, y=1, title='t-SNE Plot of Short Histology', color=hist)
#tfig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=hist)

#tfig_2d.show()
#tfig_3d.show()

# Perform PCA
#pca2 = PCA(n_components=2)
#pca3 = PCA(n_components=3)

#df_pca2 = pca2.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
#df_pca3 = pca3.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])

# Display PCA plot
#pca_2d = px.scatter(df_pca2, x=0, y=1, title='PCA Plot of Short Histology', color=hist)

# Display 3D PCA plot
#pca_3d = px.scatter_3d(df_pca3, x=0, y=1, z=2, color=hist)

#pca_2d.show()
#pca_3d.show()

In [5]:
# Split the dataset into features and labels
sh_X = short_histology_df.loc[:, short_histology_df.columns != 'Short_Histology'].values
sh_y = short_histology_df.loc[:, short_histology_df.columns == 'Short_Histology'].values.ravel()
In [6]:
# Split data into training and testing set
sh_X_train, sh_X_test, sh_y_train, sh_y_test = train_test_split(sh_X, sh_y, test_size=0.25, random_state=42)

#Sanity check
print(sh_X_train.shape, sh_X_test.shape, sh_y_train.shape, sh_y_test.shape)
(12, 2102) (5, 2102) (12,) (5,)
In [7]:
# Histology Class Histogram
fig = px.histogram(short_histology_df, x='Short_Histology', title='Histology Classes')

fig.update_xaxes(title_text='Histology')
fig.update_yaxes(title_text='Number of Samples')

fig.show()
In [8]:
# Initialize random forest classifier
sh_rf = RandomForestClassifier(max_depth=2, random_state=0)

# Train the random forest classifier
sh_rf.fit(sh_X_train, sh_y_train)

# Make predictions using random forest classifier
sh_rf_y_pred = sh_rf.predict(sh_X_test)

# Accuracy of model
print(f'Accuracy: {accuracy_score(sh_y_test, sh_rf_y_pred)}')
Accuracy: 0.6
In [9]:
# Calculate a confusion matrix
sh_cm = confusion_matrix(sh_y_test, sh_rf_y_pred, labels=sh_rf.classes_)

# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(sh_cm, text_auto=True,
                labels=dict(x="Predicted Subtype", y="True Subtype", color="Productivity"),
                x=short_histology_df['Short_Histology'].unique().tolist(),
                y=short_histology_df['Short_Histology'].unique().tolist(),
                title='Histology Classification Accuracy'
                )
disp.show()
In [10]:
# What are the most important features?
sh_rf_feature_list = short_histology_df.columns
sh_rf_feature_list = sh_rf_feature_list.drop('Short_Histology')

sh_rf_imp_features = pd.Series(sh_rf.feature_importances_, index=sh_rf_feature_list)

sh_rf_imp_genes = sh_rf_imp_features.sort_values(ascending=False).to_frame().reset_index()
sh_rf_imp_genes.columns = ["features", "importance"]

sh_rf_imp_genes_fil = sh_rf_imp_genes[~(sh_rf_imp_genes == 0.000000).any(axis=1)]
sh_rf_imp_genes_fil
Out[10]:
features importance
0 miR-4685-3p 0.030000
1 miR-374b-3p 0.030000
2 miR-4727-5p 0.020000
3 miR-6722-5p 0.020000
4 miR-6798-5p 0.020000
... ... ...
94 miR-223-5p 0.002857
95 miR-17-3p 0.002857
96 miR-181b-5p 0.002857
97 miR-6872-3p 0.002857
98 miR-6078 0.002857

99 rows × 2 columns

In [11]:
# Display interactive Barplot of important miRNAs
fig = px.bar(sh_rf_imp_genes_fil, x=sh_rf_imp_genes_fil.features, y=sh_rf_imp_genes_fil.importance)
fig.show()

Relapse¶

In [12]:
relapse_df = main_df.drop(['Specimen_Type', 'Diagnosis', 'Short_Histology', 'Tumor_Subtype', 'Survival_Status'],axis=1)
#relapse_df

Exploring the data¶

In [13]:
# Store relapse data
relapse = relapse_df['Relapse'].to_list()

# Plot UMAP
umap_2d = UMAP(n_components=2, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(relapse_df.loc[:,'CTRL_ANT1':])

umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
#umap_2d_df

ufig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2', title='UMAP Plot of Relapse', color=relapse, hover_data=['Sample_ID'])

ufig_2d.show()
In [14]:
# Split the dataset into features and labels
r_X = relapse_df.loc[:, relapse_df.columns != 'Relapse'].values
r_y = relapse_df.loc[:, relapse_df.columns == 'Relapse'].values.ravel()

# Split data into training and testing set
r_X_train, r_X_test, r_y_train, r_y_test = train_test_split(r_X, r_y, test_size=0.25, random_state=42)

#Sanity check
print(r_X_train.shape, r_X_test.shape, r_y_train.shape, r_y_test.shape)
(12, 2102) (5, 2102) (12,) (5,)
In [15]:
# Relapse Classes Histogram
fig = px.histogram(relapse_df, x='Relapse', title='Relapse Classes')

fig.update_xaxes(title_text='Relapse?')
fig.update_yaxes(title_text='Number of Samples')

fig.show()
In [16]:
# Initialize random forest classifier
r_rf = RandomForestClassifier(max_depth=2, random_state=0)

# Train the random forest classifier
r_rf.fit(r_X_train, r_y_train)

# Make predictions using random forest classifier
r_rf_y_pred = r_rf.predict(r_X_test)

# Accuracy of model
print(f'Accuracy: {accuracy_score(r_y_test, r_rf_y_pred)}')
Accuracy: 0.8
In [17]:
# Calculate a confusion matrix
r_cm = confusion_matrix(r_y_test, r_rf_y_pred, labels=r_rf.classes_)

# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(r_cm, text_auto=True,
                labels=dict(x="Predicted Relapse", y="True Relapse", color="Productivity"),
                x=relapse_df['Relapse'].unique().tolist(),
                y=relapse_df['Relapse'].unique().tolist()
                )
disp.show()
In [18]:
print(r_X_test)
print('True Relapse Label:')
print(r_y_test)
print('Predicted Relapse:')
print(r_rf_y_pred)
[[16 16 22 ... 891 75 290]
 [35 60 49 ... 1876 78 662]
 [12 68 43 ... 1020 174 370]
 [45 14 36 ... 8387 128 1737]
 [39 30 21 ... 1508 64 254]]
True Relapse Label:
['Yes' 'Yes' 'No' 'Yes' 'No']
Predicted Relapse:
['Yes' 'Yes' 'No' 'Yes' 'Yes']

The ML algorithm misclassified patient 15635-101 as having a relapse, but this patient did not at the time of plasma collection. This patient should be investigated further for relapse.

How is the Random Forest Model Learning?¶

In [19]:
from sklearn import tree

fn = relapse_df.loc[:, relapse_df.columns != 'Relapse'].columns.to_list()
cn = r_rf.classes_.tolist()
tree.plot_tree(r_rf.estimators_[0], feature_names = fn, class_names=cn, filled = True)
Out[19]:
[Text(0.5, 0.75, 'miR-873-5p <= 1924.0\ngini = 0.278\nsamples = 6\nvalue = [2, 10]\nclass = Yes'),
 Text(0.25, 0.25, 'gini = 0.0\nsamples = 5\nvalue = [0, 10]\nclass = Yes'),
 Text(0.75, 0.25, 'gini = 0.0\nsamples = 1\nvalue = [2, 0]\nclass = No')]

This is Decision Tree #1 out of 100 total decision trees within the Random Forest Classifier.

In [20]:
# What are the most important features?
r_rf_feature_list = relapse_df.columns
r_rf_feature_list = r_rf_feature_list.drop('Relapse')

r_rf_imp_features = pd.Series(r_rf.feature_importances_, index=r_rf_feature_list)

r_rf_imp_genes = r_rf_imp_features.sort_values(ascending=False).to_frame().reset_index()
r_rf_imp_genes.columns = ["features", "importance"]

r_rf_imp_genes_fil = r_rf_imp_genes[~(r_rf_imp_genes == 0.000000).any(axis=1)]
r_rf_imp_genes_fil
Out[20]:
features importance
0 miR-6506-3p 0.030303
1 miR-502-5p 0.020202
2 miR-4677-5p 0.020202
3 miR-19a-5p 0.020202
4 miR-887-5p 0.020202
... ... ...
91 miR-7106-3p 0.006734
92 miR-1976 0.006734
93 miR-7157-3p 0.003367
94 miR-4636 0.003367
95 miR-505-3p 0.002886

96 rows × 2 columns

In [21]:
# Display interactive Barplot of important miRNAs
fig = px.bar(r_rf_imp_genes_fil, x=r_rf_imp_genes_fil.features, y=r_rf_imp_genes_fil.importance)
fig.show()

How good are the ML found genes for histology classification?¶

In [22]:
# Create a dataframe containing only ML selected genes
ml_hist_df = short_histology_df.filter(sh_rf_imp_genes_fil['features'].to_list()[0:1])

umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(ml_hist_df)
proj_3d = umap_3d.fit_transform(ml_hist_df)

umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']

fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2', 
                    title='UMAP Plot using genes found by Random Forest Algorithm (Histology)', 
                    color=hist, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
                    title='UMAP 3D Plot using genes found by Random Forest Algorithm (Histology)', 
                    color=hist, hover_data=['Sample_ID'])

fig_2d.show()
fig_3d.show()
In [23]:
print('Name of the single miRNA: ', sh_rf_imp_genes_fil['features'].to_list()[0:1][0])
Name of the single miRNA:  miR-4685-3p

How good are the ML found genes for relapse prediction?¶

In [24]:
# Create a dataframe containing only ML selected genes
ml_relapse_df = relapse_df.filter(r_rf_imp_genes_fil['features'].to_list())

umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(ml_relapse_df)
proj_3d = umap_3d.fit_transform(ml_relapse_df)

umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']

fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2', 
                    title='UMAP Plot for using all genes found by Random Forest Algorithm (Relapse)', 
                    color=relapse, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
                    title='UMAP 3D Plot using all genes found by Random Forest Algorithm (Relapse)', 
                    color=relapse, hover_data=['Sample_ID'])

fig_2d.show()
fig_3d.show()

Single miRNA from Relapse selected genes¶

In [25]:
# Create a dataframe containing only ML selected genes
ml_relapse_df = relapse_df.filter(r_rf_imp_genes_fil['features'].to_list()[0:1]) # Also [0:10] (ten genes)

umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(ml_relapse_df)
proj_3d = umap_3d.fit_transform(ml_relapse_df)

umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']

fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2', 
                    title='UMAP Plot using most important gene found by Random Forest Algorithm (Relapse)', 
                    color=relapse, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
                    title='UMAP 3D Plot using most important gene found by Random Forest Algorithm (Relapse)', 
                    color=relapse, hover_data=['Sample_ID'])

fig_2d.show()
fig_3d.show()
In [26]:
# The most important gene determined by ML algorithm
print('Name of the single miRNA: ', r_rf_imp_genes_fil['features'].to_list()[0:1][0])
Name of the single miRNA:  miR-6506-3p

Interestingly, patient 15635-101 is being clustered with the relapsed patients based on just miRNA signature.


Export list of genes¶

In [27]:
# Add "hsa-" to all miRNA genes
hist_genes = sh_rf_imp_genes_fil['features'].to_list()
relapse_genes = r_rf_imp_genes_fil['features'].to_list()

hist_genes = ["hsa-" + mirnas for mirnas in hist_genes]
relapse_genes = ["hsa-" + mirnas for mirnas in relapse_genes]
In [28]:
# Export miRNA genes important for distinguishing LGG and HGG brain tumors
with open("LGG_miRNAs.txt", 'w') as file:
        for row in hist_genes:
            s = "".join(map(str, row))
            file.write(s+'\n')

# Export miRNA genes important for relapse in LGG/HGG brain tumors
with open("LGG_relapse_miRNAs.txt", 'w') as file:
        for row in relapse_genes:
            s = "".join(map(str, row))
            file.write(s+'\n')

Comparing with Shiny App results¶

In [29]:
dip_test_genes = ['miR-339-5p', 'miR-1299', 'let-7g-5p', 'miR-145-3p', 'miR-6752-5p', 'miR-708-5p', 
                  'miR-3679-3p','miR-448', 'miR-548ag', 'miR-4796-5p', 'miR-6844', 'miR-5697', 'miR-4765', 
                  'miR-8076', 'miR-1278', 'miR-3606-5p', 'miR-4424', 'miR-2114-3p', 'miR-4677-5p', 'miR-644a', 
                  'miR-6739-3p']
In [30]:
print('Number of miRNA genes found for histology: ', len(sh_rf_imp_genes_fil['features'].to_list()), 'genes')
print('Number of miRNA genes found for relapse: ', len(r_rf_imp_genes_fil['features'].to_list()), 'genes')
print('Number of miRNA genes found using dip test: ', len(dip_test_genes), 'genes')
Number of miRNA genes found for histology:  99 genes
Number of miRNA genes found for relapse:  96 genes
Number of miRNA genes found using dip test:  21 genes

Histology Clustering using Dip Test genes¶

In [31]:
# Create a dataframe containing only dip test genes
dip_histology_df = short_histology_df[dip_test_genes]

# Plot the UMAP Figures
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(dip_histology_df)
proj_3d = umap_3d.fit_transform(dip_histology_df)

umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']

fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2', 
                    title='UMAP Plot using Dip Test genes (Histology)', 
                    color=hist, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
                    title='UMAP 3D Plot using Dip Test genes (Histology)', 
                    color=hist, hover_data=['Sample_ID'])

fig_2d.show()
fig_3d.show()
In [32]:
histology_cols = short_histology_df.filter(sh_rf_imp_genes_fil['features'].to_list())
cols = list(set(histology_cols) & set(dip_test_genes))
#print(len(dip_test_genes))
#print(len(cols))
#list(set(cols) ^ set(dip_test_genes))

print('List of Dip Test genes that are also found from ML model for Histology: ')
cols
List of Dip Test genes that are also found from ML model for Histology: 
Out[32]:
['miR-339-5p']

Relapse Clustering using Dip Test genes¶

In [33]:
# Create a dataframe containing only dip test genes
dip_relapse_df = relapse_df[dip_test_genes]

# Plot the UMAP Figures
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(dip_relapse_df)
proj_3d = umap_3d.fit_transform(dip_relapse_df)

umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']

fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2', 
                    title='UMAP Plot using Dip Test genes (Relapse)', 
                    color=relapse, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
                    title='UMAP 3D Plot using Dip Test genes (Relapse)', 
                    color=relapse, hover_data=['Sample_ID'])

fig_2d.show()
fig_3d.show()
In [34]:
relapse_cols = relapse_df.filter(r_rf_imp_genes_fil['features'].to_list())
cols = list(set(relapse_cols) & set(dip_test_genes))
#print(len(dip_test_genes))
#print(len(cols))
#list(set(cols) ^ set(dip_test_genes))

print('List of Dip Test genes that are also found from ML model (Relapse): ')
cols
List of Dip Test genes that are also found from ML model (Relapse): 
Out[34]:
['miR-3606-5p', 'miR-4677-5p', 'miR-644a']

Overlap with DE genes (Histology)¶

Check if the miRNA genes found are differentially expressed between LGG and HGG patients, and between relapse and non-relapse patients.

In [35]:
hdf_de = pd.read_csv('lb_plasma_histology_DE.csv')
hdf_de = hdf_de.dropna() # Left with 756 miRNA genes

# Save the DE genes to a list
de_hist_genes = hdf_de[hdf_de['padj']<0.05]['Unnamed: 0'].tolist()

# DE genes
hdf_de[hdf_de['padj']<0.05] # 36 significant DE miRNA genes
Out[35]:
Unnamed: 0 baseMean log2FoldChange lfcSE stat pvalue padj
0 miR-9-3p 369.895025 -3.545678 0.547516 -6.475941 9.422266e-11 7.132655e-08
1 miR-215-5p 776.272909 -3.879022 0.705960 -5.494677 3.914255e-08 1.481545e-05
2 miR-375 3138.874448 -3.055984 0.660167 -4.629108 3.672441e-06 9.266793e-04
3 miR-194-5p 1887.090486 -2.689264 0.625226 -4.301267 1.698240e-05 3.213920e-03
4 miR-4665-5p 1233.004873 1.470567 0.357956 4.108238 3.986887e-05 5.836417e-03
5 miR-4667-5p 3034.388653 1.513389 0.371497 4.073763 4.625958e-05 5.836417e-03
6 miR-339-3p 19679.511317 2.426121 0.603150 4.022416 5.760415e-05 6.229477e-03
7 miR-873-5p 584.103051 1.479436 0.371861 3.978464 6.936194e-05 6.563374e-03
8 miR-4473 381.880103 2.407600 0.630612 3.817878 1.346045e-04 1.132174e-02
9 miR-145-5p 8835.449194 -2.200134 0.597729 -3.680820 2.324850e-04 1.599920e-02
10 miR-192-5p 3898.417422 -2.339416 0.633765 -3.691301 2.231097e-04 1.599920e-02
11 miR-671-5p 7550.662349 1.754070 0.479567 3.657613 2.545752e-04 1.605945e-02
12 miR-4447 579.198702 1.294593 0.368484 3.513297 4.425832e-04 2.435942e-02
13 miR-4685-3p 676.896195 1.408975 0.403649 3.490599 4.819391e-04 2.435942e-02
14 miR-6788-3p 1898.281023 1.677635 0.480672 3.490187 4.826834e-04 2.435942e-02
15 miR-1233-3p 12734.713006 1.438187 0.422884 3.400904 6.716331e-04 2.824590e-02
16 miR-200c-3p 211.510912 -1.746093 0.510905 -3.417649 6.316462e-04 2.824590e-02
17 miR-550a-5p 268.518711 0.856283 0.251547 3.404062 6.639163e-04 2.824590e-02
18 miR-4706 2351.532760 1.095132 0.328881 3.329876 8.688460e-04 3.461665e-02
19 miR-101-5p 212.197838 -0.814384 0.248808 -3.273143 1.063587e-03 3.918903e-02
20 miR-4522 1546.492458 1.310731 0.401210 3.266945 1.087146e-03 3.918903e-02
21 miR-6852-3p 6087.991997 1.532035 0.471781 3.247343 1.164880e-03 4.008245e-02
22 miR-6723-5p 485.680863 1.870328 0.580710 3.220759 1.278517e-03 4.207989e-02
23 miR-143-3p 7497.934558 -1.533629 0.479971 -3.195252 1.397089e-03 4.406652e-02
24 miR-4727-5p 528.776095 0.904792 0.285032 3.174353 1.501707e-03 4.547168e-02
25 miR-4519 811.173449 1.073130 0.342321 3.134866 1.719328e-03 4.820485e-02
26 miR-8069 955.107411 0.942425 0.299746 3.144079 1.666105e-03 4.820485e-02
27 miR-7109-5p 310.853763 0.903019 0.289804 3.115967 1.833428e-03 4.956804e-02
28 miR-1287-5p 21170.276376 1.411937 0.463423 3.046758 2.313240e-03 4.959436e-02
29 miR-200a-3p 195.398108 -0.998384 0.322769 -3.093183 1.980217e-03 4.959436e-02
30 miR-3157-5p 4207.979636 1.227023 0.395784 3.100232 1.933693e-03 4.959436e-02
31 miR-3943 4022.197686 1.450824 0.472285 3.071927 2.126820e-03 4.959436e-02
32 miR-6080 632.156671 1.468667 0.484282 3.032667 2.424031e-03 4.959436e-02
33 miR-6088 440.890616 0.736990 0.240070 3.069893 2.141353e-03 4.959436e-02
34 miR-6869-5p 903.322606 0.967520 0.318780 3.035067 2.404824e-03 4.959436e-02
35 miR-7150 1599.792604 1.350058 0.443170 3.046368 2.316243e-03 4.959436e-02
36 miR-96-5p 218.531065 -1.353603 0.445418 -3.038950 2.374044e-03 4.959436e-02
In [36]:
# None of the dip test genes were differentially expressed
print('Number of dip test and DE genes that overlap (Histology):')
list(set(de_hist_genes) & set(dip_test_genes))
Number of dip test and DE genes that overlap (Histology):
Out[36]:
[]
In [37]:
# 16 ML found genes for short histology were differentially expressed significantly
sh_overlap = list(set(de_hist_genes) & set(sh_rf_imp_genes_fil['features'].to_list()))
print('Number of ML and DE genes that overlap (Histology):')
sh_overlap
Number of ML and DE genes that overlap (Histology):
Out[37]:
['miR-873-5p',
 'miR-7150',
 'miR-550a-5p',
 'miR-3157-5p',
 'miR-6080',
 'miR-4685-3p',
 'miR-200a-3p',
 'miR-1233-3p',
 'miR-3943',
 'miR-6852-3p',
 'miR-6088',
 'miR-4667-5p',
 'miR-4727-5p',
 'miR-8069',
 'miR-194-5p',
 'miR-96-5p']
In [38]:
# Most important gene determined by ML for Histology
print('Most important gene found by ML (Histology):', sh_rf_imp_genes_fil['features'].to_list()[0:1][0])
Most important gene found by ML (Histology): miR-4685-3p

Overlap with DE genes (Relapse)¶

In [39]:
rdf_de = pd.read_csv('lb_plasma_relapse_DE.csv')
rdf_de = rdf_de.dropna() # Left with 1096 miRNA genes
# Store DE genes as a list
de_relapse_genes = rdf_de[rdf_de['padj']<0.05]['Unnamed: 0'].tolist()

rdf_de[rdf_de['padj']<0.05] # 159 significant DE miRNA genes
Out[39]:
Unnamed: 0 baseMean log2FoldChange lfcSE stat pvalue padj
0 miR-148a-3p 2627.979100 2.515368 0.492092 5.111581 3.194742e-07 0.000175
1 miR-194-5p 883.425019 2.034687 0.397943 5.113015 3.170576e-07 0.000175
2 miR-19b-3p 50281.482652 1.821879 0.368146 4.948793 7.467513e-07 0.000273
3 miR-17-5p 15476.879263 1.591049 0.334409 4.757790 1.957240e-06 0.000429
4 miR-6080 1838.423762 -3.425195 0.715219 -4.789017 1.675999e-06 0.000429
... ... ... ... ... ... ... ...
154 miR-3124-5p 545.908489 -1.081474 0.398509 -2.713802 6.651588e-03 0.047076
155 miR-126-3p 30652.890467 1.335901 0.493481 2.707097 6.787448e-03 0.047730
156 miR-4474-3p 83.975902 0.829953 0.307711 2.697179 6.992957e-03 0.048552
157 miR-6741-3p 4287.744769 -1.358001 0.503096 -2.699285 6.948857e-03 0.048552
158 miR-379-5p 132.455091 1.476587 0.548646 2.691330 7.116772e-03 0.049101

159 rows × 7 columns

In [40]:
# 11 ML found genes for recurrence were differentially expressed significantly
print('Number of ML and DE genes that overlap (Relapse):')
r_overlap = list(set(de_relapse_genes) & set(r_rf_imp_genes_fil['features'].to_list()))
r_overlap
Number of ML and DE genes that overlap (Relapse):
Out[40]:
['miR-6762-3p',
 'miR-873-5p',
 'miR-6499-5p',
 'miR-7846-3p',
 'miR-4685-3p',
 'miR-4268',
 'miR-16-5p',
 'miR-671-5p',
 'miR-4432',
 'miR-194-5p',
 'miR-34a-3p']
In [41]:
# Most important gene determined by ML for Relapse
print('Most important gene found by ML (Relapse):', r_rf_imp_genes_fil['features'].to_list()[0:1][0])
Most important gene found by ML (Relapse): miR-6506-3p

In [42]:
# Create a dataframe containing only DE overlap genes for histology
de_histology_df = short_histology_df[de_hist_genes]

# Plot the UMAP Figures
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(de_histology_df)
proj_3d = umap_3d.fit_transform(de_histology_df)

umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']

fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2', 
                    title='UMAP Plot using DE genes (Histology)', 
                    color=hist, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
                    title='UMAP 3D Plot using DE genes (Histology)', 
                    color=hist, hover_data=['Sample_ID'])

fig_2d.show()
fig_3d.show()
In [43]:
# Create a dataframe containing only DE overlap genes for histology
de_relapse_df = relapse_df[de_relapse_genes]

# Plot the UMAP Figures
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(de_relapse_df)
proj_3d = umap_3d.fit_transform(de_relapse_df)

umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']

fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2', 
                    title='UMAP Plot using DE genes (Relapse)', 
                    color=hist, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
                    title='UMAP 3D Plot using DE genes (Relapse)', 
                    color=hist, hover_data=['Sample_ID'])

fig_2d.show()
fig_3d.show()

Plotting the histology biomarkers¶

In [44]:
# Plot the UMAP Figures
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(short_histology_df[sh_overlap])
proj_3d = umap_3d.fit_transform(short_histology_df[sh_overlap])

umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']

fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2', 
                    title='UMAP Plot using Histology Biomarkers', 
                    color=hist, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
                    title='UMAP 3D Plot using Histology Biomarkers', 
                    color=hist, hover_data=['Sample_ID'])

fig_2d.show()
fig_3d.show()

Plotting the relapse biomarkers¶

In [45]:
# Plot the UMAP Figures
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(relapse_df[r_overlap])
proj_3d = umap_3d.fit_transform(relapse_df[r_overlap])

umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']

fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2', 
                    title='UMAP Plot using Relapse Biomarkers', 
                    color=hist, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
                    title='UMAP 3D Plot using Relapse Biomarkers', 
                    color=hist, hover_data=['Sample_ID'])

fig_2d.show()
fig_3d.show()

Ammar's list of genes¶

In [46]:
# Ammar's list of genes
ammar = ['miR-9-5p', 'miR-124-3p', 'miR-137', 'miR-194-5p', 'miR-340-5p']

# Create a dataframe containing only those genes
ammar_df = main_df.filter(ammar)

umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(ammar_df)
proj_3d = umap_3d.fit_transform(ammar_df)

umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']

fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2', 
                    title='UMAP using MIR9‐5p, MIR124‐3p, MIR137, MIR194‐5p, and MIR340‐5p (Histology)', 
                    color=main_df['Short_Histology'].tolist(), hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
                    title='3D UMAP using MIR9‐5p, MIR124‐3p, MIR137, MIR194‐5p, and MIR340‐5p (Histology)', 
                    color=main_df['Short_Histology'].tolist(), hover_data=['Sample_ID'])

fig_2d.show()
fig_3d.show()